import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from random import sample
import numpy as np

import warnings
warnings.filterwarnings('ignore')

#################################
# Set parameters
#################################

# Number of folds used by every cross-validation step in this script
cv = 5

#################################
# Load data
#################################

# Load entities to get number of entities
# (column count of the entities/verbs table minus one)
entities = pd.read_csv('../data/temp/gst_dtm_entities_verbs.csv')
feature_number = len(entities.columns) - 1

# Load narratives; every column after the first is treated as a feature
df = pd.read_csv('../data/temp/gst_dtm_narratives.csv')
features = list(df)[1:]
print('No of features -- total:', len(features))
# If reduction of feature no. desired, uncomment the following four lines
# features = sample(features,feature_number)
# print('No of features -- after sampling (adjusting for no. of entities)', len(features))
# features_to_keep = features + ['id']
# df = df[features_to_keep]

# Collapse by speaker: strip the last three characters of each id, then sum
# the rows that share the truncated id
df['id'] = df['id'].str[:-3]
df = df.groupby(['id']).sum().reset_index()

# Metadata: truncate ids the same way and keep one row per truncated id
meta = pd.read_csv('../data/temp/gst_speaker_metadata.csv')
meta['id'] = meta['id'].str[:-3]
meta = meta.drop_duplicates(subset=['id'])[['id', 'republican']]

# Row counts before/after the merge and the _merge breakdown are printed so
# unmatched speakers are visible in the log
print(len(df))
df = df.merge(meta, on='id', how='left', indicator=True)
print(len(df))
print(df['_merge'].value_counts())
df = df.drop(columns=['_merge'])

# A left merge leaves the label empty for speakers without metadata; such rows
# cannot be stratified on or used as training targets, so report and drop them
missing_label = df['republican'].isna()
if missing_label.any():
    print('Dropping speakers without a party label:', int(missing_label.sum()))
    df = df[~missing_label].reset_index(drop=True)

#################################
# Get train test split
#################################
# Hold out 25% of the rows, stratified on the label, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    df[features], df['republican'],
    test_size=0.25, random_state=101, stratify=df['republican'])

#################################
# Oversample minority class articles
#################################
# Only the training partition is resampled; the test partition is left as is.
# The sampler is seeded like the split and the model so that repeated runs
# produce the same resampled training set.
random_sampler = RandomOverSampler(random_state=101)
x_train_res, y_train_res = random_sampler.fit_resample(x_train, y_train)

#################################
# Fit regularized logistic
# regression
#################################

# Grid of candidate values for C; np.arange(0.25, 1.5, 0.75) yields 0.25 and 1.0
parameters = {'C': np.arange(0.25, 1.5, 0.75)}

# L2-penalised logistic regression with the saga solver and a fixed seed
lgr = LogisticRegression(
    penalty='l2',
    solver='saga',
    max_iter=2000,
    random_state=0,
)

# Grid search over C with `cv` folds, fitted on the oversampled training data
clf = GridSearchCV(estimator=lgr, param_grid=parameters, cv=cv)
print('Fitting model...:')
clf.fit(x_train_res, y_train_res)

# Keep the refitted best estimator and persist it for later use
best_lgr = clf.best_estimator_
pd.to_pickle(
    best_lgr,
    '../models/partisanship_prediction_models/narratives_model.pkl',
)

# Calculate accuracy via cross-validation
# (run on the original, non-oversampled training rows)
print('######## Training data ########')
scores = cross_val_score(best_lgr, x_train, y_train, cv=cv)
print("Accuracy with CV: {0} (+/- {1})".format(scores.mean(), (scores.std() * 1.96)/np.sqrt(cv)))

# Predict for entire training dataset
predicted = best_lgr.predict(x_train)
# Build the export table as a separate frame so that x_train stays a pure
# feature matrix instead of gaining prediction/label columns in place
train_predictions = x_train.assign(predicted=predicted, republican=list(y_train))
correct_train = int((train_predictions['predicted'] == train_predictions['republican']).sum())
print("Accuracy without CV:", correct_train/len(train_predictions))
print("F1-score:", f1_score(train_predictions['republican'], train_predictions['predicted']))
print("Precision score:", precision_score(train_predictions['republican'], train_predictions['predicted']))
print("Recall score:", recall_score(train_predictions['republican'], train_predictions['predicted']))
train_predictions.to_csv('../models/partisanship_prediction_models/narratives_training_data_predictions.csv', index=False)

# Calculate accuracy via cross-validation
# NOTE(review): cross_val_score fits fresh copies of the estimator on folds of
# the test data, so this figure is not the held-out accuracy of the pickled
# model; the held-out figure is the "without CV" number printed below.
print('######## Test data ########')
scores = cross_val_score(best_lgr, x_test, y_test, cv=cv)
print("Accuracy with CV: {0} (+/- {1})".format(scores.mean(), (scores.std() * 1.96)/np.sqrt(cv)))

# Predict for entire test dataset
predicted = best_lgr.predict(x_test)
# Build the export table as a separate frame so that x_test stays a pure
# feature matrix instead of gaining prediction/label columns in place
test_predictions = x_test.assign(predicted=predicted, republican=list(y_test))
correct_test = int((test_predictions['predicted'] == test_predictions['republican']).sum())
print("Accuracy without CV:", correct_test/len(test_predictions))
print("F1-score:", f1_score(test_predictions['republican'], test_predictions['predicted']))
print("Precision score:", precision_score(test_predictions['republican'], test_predictions['predicted']))
print("Recall score:", recall_score(test_predictions['republican'], test_predictions['predicted']))
test_predictions.to_csv('../models/partisanship_prediction_models/narratives_test_data_predictions.csv', index=False)

# Extract features
# Flatten the fitted coefficient array so each value lines up with one feature name
coef_list = np.ravel(best_lgr.coef_).tolist()
feature_coef = pd.DataFrame({'feature': features, 'coefficient': coef_list})
# Sort by importance of coefficient (largest first) and renumber the rows
feature_coef = (
    feature_coef
    .sort_values(by=['coefficient'], ascending=False)
    .reset_index(drop=True)
)
# Save the result
feature_coef.to_csv(
    '../models/partisanship_prediction_models/narratives_features.csv',
    index=False,
)
